Load preprocessed dataset (preprocessing code in data_preprocessing.Rmd)
# Summarise dataset dimensions and the diagnosis breakdown of the samples
n_asd = sum(datMeta$Diagnosis_ == 'ASD')
n_ctl = sum(datMeta$Diagnosis_ != 'ASD')
glue('Number of genes: ', nrow(datExpr), '\n',
     'Number of samples: ', ncol(datExpr), ' (', n_asd, ' ASD, ', n_ctl, ' controls)')
## Number of genes: 30112
## Number of samples: 80 (45 ASD, 35 controls)
Significance criteria: adjusted p-value<0.05 and abs(lfc)>log2(1.2) (absolute value because we care about both the under- and overexpressed genes)
# Keep only differentially expressed genes: adjusted p-value < 0.05 and
# |log2 fold change| > log2(1.2) (both under- and overexpressed genes).
# which() drops genes whose padj is NA (DESeq2 sets padj to NA for genes
# removed by independent filtering); logical subsetting with NA would
# otherwise inject all-NA rows into datExpr.
keep_genes = which(DE_info$padj < 0.05 & abs(DE_info$log2FoldChange) > log2(1.2))
datExpr = datExpr[keep_genes,]
print(paste0(nrow(datExpr), ' probes left.'))
## [1] "1055 probes left."
Since there are still more genes than samples after filtering, we can perform PCA and reduce the dimension from ~1K genes to 80 components without losing any information
# PCA on the samples (t() puts samples in rows, genes as features)
pca = prcomp(t(datExpr))
datExpr_redDim = as.data.frame(pca$x)  # samples x principal components
clusterings = list()                   # one entry per clustering method
No recognisable best k, so chose k=6
# Elbow plot: total within-cluster sum of squares for k = 1..15
set.seed(123)
wss = vapply(1:15,
             function(k) kmeans(datExpr_redDim, k, nstart=25)$tot.withinss,
             numeric(1))
plot(wss, type='b', main='K-Means Clustering')
best_k = 6  # no clear elbow in the plot, so k=6 was chosen manually
abline(v = best_k, col='blue')
datExpr_k_means = kmeans(datExpr_redDim, best_k, nstart=25)
clusterings[['km']] = datExpr_k_means$cluster
Chose k=8 as the best number of clusters (cutting the dendrogram at height 39).
# Hierarchical clustering on Euclidean distances between samples
h_clusts = as.dendrogram(hclust(dist(datExpr_redDim)))
plot(h_clusts)
abline(h=39, col='blue')  # cut height used to decide the number of clusters
best_k = 8
Clusters seem to be able to separate ASD and control samples pretty well and there are no noticeable patterns regarding sex, age or brain region in any cluster.
Colors:
Diagnosis: Blue=control, Green=ASD
Sex: Pink=Female, Blue=Male
Brain region: Pink=Frontal, Green=Temporal, Blue=Parietal, Purple=Occipital
Age: Purple=youngest, Yellow=oldest
clusterings[['hc']] = cutree(h_clusts, best_k)
create_viridis_dict = function(){
  # Build a named vector mapping every age in the observed range to a viridis
  # colour (purple = youngest, yellow = oldest), for the dendrogram colour bar.
  # NOTE(review): assumes Age values are integers -- verify, since the lookup
  # at plot time uses as.character(Age) against these names.
  age_range = range(datMeta$Age)
  viridis_age_cols = viridis(age_range[2] - age_range[1] + 1)
  names(viridis_age_cols) = seq(age_range[1], age_range[2])
  return(viridis_age_cols)
}
viridis_age_cols = create_viridis_dict()
# Build one colour column per phenotype, with rows reordered to match the
# dendrogram leaf order so colored_bars() lines up with the leaves
dend_meta = datMeta[match(labels(h_clusts), rownames(datMeta)),] %>%
mutate('Diagnosis' = ifelse(Diagnosis_=='CTL','#008080','#86b300'), # Blue control, Green ASD
'Sex' = ifelse(Sex=='F','#ff6666','#008ae6'), # Pink Female, Blue Male
'Region' = case_when(Brain_lobe=='Frontal'~'#F8766D', # ggplot defaults for 4 colours
Brain_lobe=='Temporal'~'#7CAE00',
Brain_lobe=='Parietal'~'#00BFC4',
Brain_lobe=='Occipital'~'#C77CFF'),
'Age' = viridis_age_cols[as.character(Age)]) %>% # Purple: young, Yellow: old
dplyr::select(Age, Region, Sex, Diagnosis)
# Plot the dendrogram with blank leaf labels, branches coloured by cluster,
# and the phenotype colour bars underneath
h_clusts %>% set('labels', rep('', nrow(datMeta))) %>% set('branches_k_color', k=best_k) %>% plot
colored_bars(colors=dend_meta)
Chose k=5 as the best clustering.
*The rest of the output plots can be found in the Data/Gandal/consensusClustering/samples_DE_probes folder
Following this paper’s guidelines:
Run PCA and keep enough components to explain 60% of the variance (the first 2 components already explain over 60% of the variance, so the first 37 were kept instead, accumulating 90% of the variance)
Run ICA with that same number of nbComp as principal components kept to then filter them
Select components with kurtosis > 3
Assign obs to genes with FDR<0.01 using the fdrtool package
It’s not supposed to be a good method for clustering samples because ICA does not perform well with small sample sizes (see Figure 4 of this paper)
Still, it leaves only 5 samples without a cluster
ICA_clusters %>% rowSums %>% table
## .
## 0 1 2 3 4 5 6 7 8
## 5 32 14 9 12 4 2 1 1
# Binary membership heatmap of samples vs ICA clusters (plus a cl_sum column
# with the per-sample cluster count); coord_flip puts clusters on the y axis
ICA_clusters %>% mutate(cl_sum=rowSums(.)) %>% as.matrix %>% melt %>% ggplot(aes(Var2,Var1)) +
geom_tile(aes(fill=value)) + xlab('Clusters') + ylab('Samples') +
theme_minimal() + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + coord_flip()
This method doesn’t work well:
Using power=5 as it is the first power that exceeds the 0.85 threshold
best_power = datExpr_redDim %>% t %>% pickSoftThreshold(powerVector = 1:30)
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.3020 0.403 0.6280 24.9000 2.73e+01 38.600
## 2 2 0.2140 -0.229 0.0855 11.5000 1.16e+01 24.100
## 3 3 0.5790 -0.508 0.4590 6.2000 5.49e+00 16.400
## 4 4 0.7520 -0.705 0.6860 3.6800 2.77e+00 11.800
## 5 5 0.9150 -0.792 0.9190 2.3200 1.47e+00 8.780
## 6 6 0.9690 -0.919 0.9660 1.5400 8.02e-01 6.720
## 7 7 0.8810 -0.996 0.8740 1.0600 5.22e-01 5.270
## 8 8 0.1650 -2.170 -0.0512 0.7520 3.48e-01 4.210
## 9 9 0.1730 -2.210 -0.0502 0.5510 2.31e-01 3.420
## 10 10 0.8990 -1.090 0.8830 0.4150 1.53e-01 2.820
## 11 11 0.8660 -1.120 0.8520 0.3200 9.83e-02 2.360
## 12 12 0.8640 -1.040 0.8920 0.2530 6.16e-02 2.000
## 13 13 0.1610 -2.930 -0.0410 0.2050 3.89e-02 1.720
## 14 14 0.1590 -2.980 -0.0282 0.1690 2.56e-02 1.490
## 15 15 0.0724 -1.970 0.1410 0.1420 1.68e-02 1.300
## 16 16 0.1260 -2.800 0.1640 0.1220 1.10e-02 1.160
## 17 17 0.0748 -1.900 0.1760 0.1060 7.49e-03 1.090
## 18 18 0.0416 -1.480 0.4140 0.0937 5.06e-03 1.040
## 19 19 0.0461 -1.490 0.3990 0.0836 3.38e-03 0.985
## 20 20 0.0575 -1.580 0.3940 0.0755 2.28e-03 0.938
## 21 21 0.0123 -0.763 0.7130 0.0687 1.54e-03 0.895
## 22 22 0.0630 -1.650 0.4320 0.0631 1.07e-03 0.855
## 23 23 0.0715 -1.690 0.4130 0.0583 7.55e-04 0.818
## 24 24 0.0387 -1.190 0.3330 0.0541 5.35e-04 0.783
## 25 25 0.0434 -1.210 0.3310 0.0505 3.81e-04 0.750
## 26 26 0.0486 -1.240 0.3180 0.0474 2.70e-04 0.719
## 27 27 0.0537 -1.260 0.3070 0.0445 1.88e-04 0.690
## 28 28 0.0590 -1.270 0.3120 0.0420 1.31e-04 0.661
## 29 29 0.0634 -1.280 0.3050 0.0397 9.16e-05 0.635
## 30 30 0.0675 -1.280 0.2990 0.0376 6.41e-05 0.609
Running WGCNA with power=5
# Best power
# Build the WGCNA network with the automatically estimated soft power.
# NOTE(review): powerEstimate is NA when no power reaches the R^2 cutoff --
# the fit table above suggests it resolves here, but confirm on rerun.
network = datExpr_redDim %>% t %>% blockwiseModules(power=best_power$powerEstimate, numericLabels=TRUE)
## mergeCloseModules: less than two proper modules.
## ..color levels are 0, 1
## ..there is nothing to merge.
Cluster distribution
table(network$colors)
##
## 0 1
## 18 62
# Store the WGCNA assignments, named by sample ID so they align with the
# other clustering methods
clusterings[['WGCNA']] = network$colors
names(clusterings[['WGCNA']]) = rownames(datExpr_redDim)
Running WGCNA with power=10, leaves out even more genes and still only finds 1 cluster
# 2nd best
# Repeat with the second-best candidate power (10) for comparison
network = datExpr_redDim %>% t %>% blockwiseModules(power=10, numericLabels=TRUE)
## mergeCloseModules: less than two proper modules.
## ..color levels are 0, 1
## ..there is nothing to merge.
Cluster distribution
table(network$colors)
##
## 0 1
## 43 37
Using the original expression dataset, the \(R^2\) threshold is never achieved, getting closest at power 1, but still doesn’t manage to find any clusters within the data
best_power = datExpr %>% pickSoftThreshold(powerVector = 1:30)
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.664000 18.6000 0.78300 71.20 72.30 73.6
## 2 2 0.663000 9.2200 0.76000 64.30 66.20 68.8
## 3 3 0.080100 10.8000 -0.01770 58.30 60.80 64.4
## 4 4 0.070700 7.5500 -0.00744 53.00 56.00 60.5
## 5 5 0.616000 3.1100 0.76000 48.30 51.60 56.8
## 6 6 0.597000 2.5500 0.70100 44.20 47.70 53.5
## 7 7 0.580000 2.0700 0.73300 40.50 44.20 50.5
## 8 8 0.552000 1.8300 0.63300 37.20 41.00 47.6
## 9 9 0.107000 4.4700 -0.03060 34.30 38.10 45.0
## 10 10 0.382000 1.2000 0.40000 31.60 35.50 42.6
## 11 11 0.340000 1.0400 0.30100 29.20 33.00 40.4
## 12 12 0.362000 0.9000 0.37900 27.00 30.70 38.3
## 13 13 0.401000 0.7860 0.42700 25.00 28.50 36.4
## 14 14 0.362000 0.7360 0.32400 23.20 26.50 34.5
## 15 15 0.330000 0.6580 0.27100 21.60 24.70 32.8
## 16 16 0.194000 0.5290 0.09890 20.10 23.10 31.2
## 17 17 0.150000 0.5420 0.01710 18.70 21.60 29.7
## 18 18 0.120000 0.3700 0.01690 17.50 20.20 28.3
## 19 19 0.043500 0.2720 -0.06120 16.30 18.90 27.0
## 20 20 0.028600 0.1530 -0.20000 15.30 17.70 25.7
## 21 21 0.011900 0.0920 -0.22900 14.30 16.50 24.6
## 22 22 0.001150 0.0315 -0.28200 13.40 15.50 23.4
## 23 23 0.000775 0.0240 -0.28500 12.60 14.50 22.4
## 24 24 0.000578 0.0195 -0.28100 11.80 13.60 21.4
## 25 25 0.000829 0.0225 -0.27700 11.10 12.70 20.4
## 26 26 0.002890 -0.0462 -0.28200 10.40 11.90 19.5
## 27 27 0.015600 -0.0979 -0.25800 9.78 11.20 18.7
## 28 28 0.043000 -0.1710 -0.23000 9.20 10.50 17.9
## 29 29 0.060100 -0.2080 -0.20800 8.67 9.83 17.1
## 30 30 0.082400 -0.2300 -0.17400 8.17 9.23 16.4
Running WGCNA with power=1
# Best power
# power=1 hard-coded from the table above (closest to the R^2 cutoff)
network = datExpr %>% blockwiseModules(power=1, numericLabels=TRUE)
## mergeCloseModules: less than two proper modules.
## ..color levels are 1
## ..there is nothing to merge.
Cluster distribution
table(network$colors)
##
## 1
## 80
Points don’t seem to follow a Gaussian distribution no matter the number of clusters; chose 6 groups following the best k from K-means, since the methods are similar
# BIC for 1..50 Gaussian mixture components, to look for an optimal k
n_clust = datExpr_redDim %>% Optimal_Clusters_GMM(max_clusters=50, criterion='BIC', plot_data=FALSE)
plot(n_clust, type='l', main='Bayesian Information Criterion to choose number of clusters')
best_k = 6 # copying k-means best_k
gmm = datExpr_redDim %>% GMM(best_k)
# Assign each sample to its most likely mixture component
clusterings[['GMM']] = gmm$Log_likelihood %>% apply(1, which.max)
Plot of clusters with their centroids in gray
# Append the centroids to the sample coordinates; their labels are NA so
# they render in ggplot's default NA colour (gray)
gmm_points = rbind(datExpr_redDim, setNames(data.frame(gmm$centroids), names(datExpr_redDim)))
gmm_labels = c(clusterings[['GMM']], rep(NA, best_k)) %>% as.factor
ggplotly(gmm_points %>% ggplot(aes(x=PC1, y=PC2, color=gmm_labels)) + geom_point() + theme_minimal())
# Clean up intermediates (some, e.g. cc_output/ICA_output/signals_w_kurtosis,
# come from chunks not shown here).
# NOTE(review): 'c' in this list is suspicious -- confirm a variable named c
# was actually created upstream, otherwise rm() emits a warning.
rm(wss, datExpr_k_means, h_clusts, cc_output, best_k, ICA_output, ICA_clusters_names,
signals_w_kurtosis, n_clust, gmm, gmm_points, gmm_labels, network, dend_meta,
best_power, c, viridis_age_cols, create_viridis_dict, pca)
Using Adjusted Rand Index:
K-means, Hierarchical Clustering, Consensus Clustering and Gaussian Mixtures seem to give very similar clusterings (similar to the unfiltered version of this analysis)
ASD seems to be captured best by K-Means and Consensus Clustering (although it’s not that good)
ICA seems to be the best clustering to capture Age (although it’s not that good)
No clusterings were able to capture any type of Sex or Brain Region structure
# Compare all clusterings to each other and to the phenotype labels using
# the Adjusted Rand Index. Only the upper triangle (j >= i) is filled since
# ARI is symmetric; the diagonal is always 1.
clusters_plus_phenotype = clusterings
clusters_plus_phenotype[['Subject']] = datMeta$Subject_ID
clusters_plus_phenotype[['ASD']] = datMeta$Diagnosis_
clusters_plus_phenotype[['Region']] = datMeta$Brain_lobe
clusters_plus_phenotype[['Sex']] = datMeta$Sex
clusters_plus_phenotype[['Age']] = datMeta$Age
# Hoist the list length and use seq_len() (safe for empty input, unlike 1:n)
n_labelings = length(clusters_plus_phenotype)
cluster_sim = data.frame(matrix(nrow = n_labelings, ncol = n_labelings))
for(i in seq_len(n_labelings)){
  cluster1 = as.factor(clusters_plus_phenotype[[i]])
  for(j in i:n_labelings){
    cluster2 = as.factor(clusters_plus_phenotype[[j]])
    cluster_sim[i,j] = adj.rand.index(cluster1, cluster2)
  }
}
colnames(cluster_sim) = names(clusters_plus_phenotype)
rownames(cluster_sim) = colnames(cluster_sim)
cluster_sim = cluster_sim %>% as.matrix %>% round(2)
# Static heatmap with the ARI value printed inside each cell
heatmap.2(x = cluster_sim, Rowv = FALSE, Colv = FALSE, dendrogram = 'none',
          cellnote = cluster_sim, notecol = 'black', trace = 'none', key = FALSE,
          cexRow = 1, cexCol = 1, margins = c(7,7))
rm(i, j, n_labelings, cluster1, cluster2, clusters_plus_phenotype, cluster_sim)
# Assemble the first 3 PCs plus every clustering and phenotype column for the
# interactive scatter plot ('cc' and 'ICA_min' come from chunks not shown here)
plot_points = datExpr_redDim %>% data.frame() %>% dplyr::select(PC1:PC3) %>%
mutate(ID = rownames(.), Subject_ID = datMeta$Subject_ID,
KMeans = as.factor(clusterings[['km']]), Hierarchical = as.factor(clusterings[['hc']]),
Consensus = as.factor(clusterings[['cc']]), ICA = as.factor(clusterings[['ICA_min']]),
GMM = as.factor(clusterings[['GMM']]), WGCNA = as.factor(clusterings[['WGCNA']]),
Sex = as.factor(datMeta$Sex), Region = as.factor(datMeta$Brain_lobe),
Diagnosis = as.factor(datMeta$Diagnosis_), Age = datMeta$Age)
Now, PC1 seems to separate samples by Diagnosis relatively well
selectable_scatter_plot(plot_points, plot_points)